import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
# data processing/manipulation
pd.options.mode.chained_assignment = None
import re
# data visualization
import matplotlib.pyplot as plt
from os import path
from PIL import Image
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
import plotly.express as px
# stopwords, tokenizer, stemmer
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.probability import FreqDist
# spell correction, lemmatization
from textblob import TextBlob
from textblob import Word
# sklearn
from sklearn.model_selection import train_test_split
# Load the two hashtag CSV exports.
# The paths are raw strings so the Windows backslashes stay literal; in a
# normal string "\B", "\A" and "\h" are invalid escape sequences.
# lineterminator='\n' is presumably there so carriage returns inside tweet
# text are not treated as row breaks -- TODO confirm against the raw files.
trump_df = pd.read_csv(r'D:\BANSummer2021\BAN200Sentiment Analysis and Text Mining\AssignmentGroup\hashtag_donaldtrump.csv', lineterminator='\n')
biden_df = pd.read_csv(r'D:\BANSummer2021\BAN200Sentiment Analysis and Text Mining\AssignmentGroup\hashtag_joebiden.csv', lineterminator='\n')
# First 10 rows of trump_df
trump_df.head(10)
# Looking at the shape of trump_df (rows, columns)
trump_df.shape
# Getting some insights on trump_df dataset (with describe())
trump_df.describe()
# More insights on trump_df with info()
trump_df.info()
# First 10 rows of biden_df
biden_df.head(10)
# Looking at the shape of biden_df (rows, columns)
biden_df.shape
# Getting some insights on biden_df dataset with describe()
biden_df.describe()
# More insights on biden_df with info()
biden_df.info()
# Data cleaning: drop the tweet/user identifier columns, which nothing below reads.
dropped_columns = ['tweet_id', 'user_id', 'user_name', 'user_screen_name',
                   'user_description', 'user_join_date', 'collected_at']
trump_df = trump_df.drop(columns=dropped_columns)
biden_df = biden_df.drop(columns=dropped_columns)
# Rename the columns used for reporting to capitalised labels.
renamed_columns = {"likes": "Likes", "retweet_count": "Retweets",
                   "state": "State", "user_followers_count": "Followers"}
trump_df = trump_df.rename(columns=renamed_columns)
biden_df = biden_df.rename(columns=renamed_columns)
# Normalise the country name so both spellings match the filter below.
d = {"United States of America": "United States"}
# Assign the result back instead of calling replace(..., inplace=True) on the
# column selection: the in-place form acts on a temporary object and does not
# update the DataFrame when pandas Copy-on-Write is active.
trump_df['country'] = trump_df['country'].replace(d)
biden_df['country'] = biden_df['country'].replace(d)
# Keep only tweets located in the United States.
trump_df = trump_df.loc[trump_df['country'] == "United States"]
biden_df = biden_df.loc[biden_df['country'] == "United States"]
# Remove rows that have a null value in any remaining column.
trump_df = trump_df.dropna()
biden_df = biden_df.dropna()
# Text that clean_tweet() blanks out: digit runs, URLs, and any run of
# characters that are not ASCII letters or digits.
# "https?" makes the trailing "s" optional, so both http and https links match.
to_remove = r'\d+|https?\S*|[^A-Za-z0-9]+'
stop_words = set(stopwords.words('english'))
ps = PorterStemmer()


# Function to preprocess tweet
def clean_tweet(tweet, stem=False, lemmatize=False):
    """Lowercase, strip noise from, and tokenize a single tweet.

    Args:
        tweet: Raw tweet text (a string).
        stem: When true, each kept token is reduced with the Porter stemmer.
        lemmatize: When true (and ``stem`` is false), each kept token is
            lemmatized with TextBlob's ``Word.lemmatize``.

    Returns:
        A list of tokens with English stopwords removed. If both flags are
        set, stemming takes precedence.
    """
    # Lowercase first, then replace links, digits and punctuation with spaces
    tweet = re.sub(to_remove, ' ', tweet.lower())
    filtered_tweet = []
    for word in word_tokenize(tweet):
        # Skip stopwords entirely
        if word in stop_words:
            continue
        if stem:
            filtered_tweet.append(ps.stem(word))
        elif lemmatize:
            filtered_tweet.append(Word(word).lemmatize())
        else:
            filtered_tweet.append(word)
    return filtered_tweet
# Tokenize and clean every tweet in both frames with clean_tweet();
# the defaults are used, so tokens are neither stemmed nor lemmatized
trump_df['tweetNew'] = trump_df['tweet'].apply(clean_tweet)
biden_df['tweetNew'] = biden_df['tweet'].apply(clean_tweet)
# Preview the first cleaned Trump tweets
trump_df['tweetNew'].head()
# Preview the first cleaned Biden tweets
biden_df['tweetNew'].head()
# Function to perform sentiment analysis on trump and biden dataframes
def sentiment_analysis(df):
    """Score every tweet in ``df`` with TextBlob and label its sentiment.

    The frame is modified in place: ``Polarity``, ``Subjectivity`` and
    ``Sentiment`` columns are added, computed from the token lists in
    ``df['tweetNew']``.

    Args:
        df: DataFrame with a ``tweet`` column and a ``tweetNew`` column of
            token lists.

    Returns:
        The first five rows of the tweet, tweetNew, Polarity, Subjectivity
        and Sentiment columns, as a preview.
    """
    # Run TextBlob once per tweet and read both measures from the same result
    scores = [TextBlob(' '.join(tokens)).sentiment for tokens in df['tweetNew']]
    df['Polarity'] = [score.polarity for score in scores]
    df['Subjectivity'] = [score.subjectivity for score in scores]
    # Classify overall sentiment from the sign of the polarity
    df.loc[df.Polarity > 0, 'Sentiment'] = 'positive'
    df.loc[df.Polarity == 0, 'Sentiment'] = 'neutral'
    df.loc[df.Polarity < 0, 'Sentiment'] = 'negative'
    return df[['tweet', 'tweetNew', 'Polarity', 'Subjectivity', 'Sentiment']].head()
# Perform sentiment analysis on trump_df: adds the Polarity, Subjectivity and
# Sentiment columns in place; the returned five-row preview is not stored
sentiment_analysis(trump_df)
# Perform sentiment analysis on biden_df: same in-place columns, preview not stored
sentiment_analysis(biden_df)
# Trump: count tweets in each sentiment class and report the totals
print("Trump Tweet Sentiment Breakdown")
trump_positive, trump_neutral, trump_negative = (
    len(trump_df[trump_df['Sentiment'] == label])
    for label in ('positive', 'neutral', 'negative')
)
print("Number of Positive Tweets: ", trump_positive)
print("Number of Neutral Tweets: ", trump_neutral)
print("Number of Negative Tweets: ", trump_negative)
# Trump: bar chart of the three counts
data_t = dict(zip(('Positive', 'Neutral', 'Negative'),
                  (trump_positive, trump_neutral, trump_negative)))
sentiment_t = list(data_t)
num_tweets_t = list(data_t.values())
plt.figure(figsize=(8, 5))
plt.bar(sentiment_t, num_tweets_t, width=0.5, color='red', edgecolor='black')
plt.xlabel("Sentiment", fontweight='bold')
plt.ylabel("Number of Tweets", fontweight='bold')
plt.title("Trump Tweets by Sentiment", fontweight='bold')
plt.show()
# Biden: count tweets in each sentiment class and report the totals
print("Biden Tweet Sentiment Breakdown")
biden_positive, biden_neutral, biden_negative = (
    len(biden_df[biden_df['Sentiment'] == label])
    for label in ('positive', 'neutral', 'negative')
)
print("Number of Positive Tweets: ", biden_positive)
print("Number of Neutral Tweets: ", biden_neutral)
print("Number of Negative Tweets: ", biden_negative)
# Biden: bar chart of the three counts
data_b = dict(zip(('Positive', 'Neutral', 'Negative'),
                  (biden_positive, biden_neutral, biden_negative)))
sentiment_b = list(data_b)
num_tweets_b = list(data_b.values())
plt.figure(figsize=(8, 5))
plt.bar(sentiment_b, num_tweets_b, width=0.5, color='blue', edgecolor='black')
plt.xlabel("Sentiment", fontweight='bold')
plt.ylabel("Number of Tweets", fontweight='bold')
plt.title("Biden Tweets by Sentiment", fontweight='bold')
plt.show()
# Share of tweets per sentiment class (fractions between 0 and 1) - Trump.
# If the frame is empty after filtering, every share is 0.0 rather than
# raising ZeroDivisionError.
total_tweets_t = len(trump_df.Sentiment)
prop_tweets_t = [round(count / total_tweets_t, 2) if total_tweets_t else 0.0
                 for count in num_tweets_t]
# Share of tweets per sentiment class - Biden (same empty-frame guard)
total_tweets_b = len(biden_df.Sentiment)
prop_tweets_b = [round(count / total_tweets_b, 2) if total_tweets_b else 0.0
                 for count in num_tweets_b]
# Grouped bar chart comparing the Trump and Biden shares side by side
bar_width = 0.25
plt.subplots(figsize=(8,8))
# x positions: Trump bars at 0, 1, 2 and Biden bars one bar-width to the right
br1 = np.arange(3)
br2 = [x + bar_width for x in br1]
t = plt.bar(br1, prop_tweets_t, color ='r', width = bar_width,
            edgecolor ='black', label ='Trump')
b = plt.bar(br2, prop_tweets_b, color ='b', width = bar_width,
            edgecolor ='black', label ='Biden')
plt.xlabel('Sentiment',fontweight ='bold')
plt.ylabel('Percentage of Tweets',fontweight ='bold')
# Centre each tick label between the pair of bars it describes
plt.xticks([r + bar_width/2 for r in range(3)],['Positive','Neutral','Negative'])
plt.legend([t,b],['Percentage of Trump Tweets','Percentage of Biden Tweets'])
plt.ylim(0.0, 1.0)
plt.title('Proportions of Tweets By Sentiment',fontweight ='bold')
plt.show()
# Function to return a string of all words in all tweets
def get_all_tweets(df, by_sentiment=False, sentiment="positive"):
    """Join the cleaned tokens of every tweet into one space-separated string.

    Args:
        df: DataFrame with a ``tweetNew`` column of token lists. A
            ``Sentiment`` column is also required when ``by_sentiment`` is true.
        by_sentiment: When true, only tweets whose ``Sentiment`` equals
            ``sentiment`` are included. When false (the default), all tweets
            are included and ``sentiment`` is ignored.
        sentiment: Sentiment label to keep when ``by_sentiment`` is true.

    Returns:
        A single string of all selected tokens; empty if no tweet is selected.
    """
    tweets = df['tweetNew']
    if by_sentiment:
        # Restrict to the requested sentiment class
        tweets = tweets[df['Sentiment'] == sentiment]
    return ' '.join(' '.join(tokens) for tokens in tweets)
# Build one space-separated corpus string per candidate
words_trump, words_biden = (
    get_all_tweets(frame) for frame in (trump_df, biden_df)
)
# Split each corpus string back into a flat list of tokens
tokens_trump, tokens_biden = (
    word_tokenize(corpus) for corpus in (words_trump, words_biden)
)
# Function to generate word cloud
def create_wordcloud(words):
    """Render and show a word cloud for the text in ``words``.

    Args:
        words: A single string containing the words to draw.
    """
    # Configure the generator: white canvas, at most 200 words, font size capped at 200
    cloud_builder = WordCloud(background_color="white", max_words=200,
                              max_font_size=200)
    cloud_image = cloud_builder.generate(words)
    # Draw the image on figure 1 with the axes hidden
    plt.figure(1, figsize=(13, 13))
    plt.imshow(cloud_image, interpolation='bilinear')
    plt.axis("off")
    plt.show()
# Draw one word cloud per candidate: Trump tweets first, then Biden tweets
for corpus in (words_trump, words_biden):
    create_wordcloud(corpus)
# Average polarity by state (trump)
# Select Polarity before aggregating: taking the mean of the whole frame would
# also try to average the text columns, which fails on pandas >= 2.0.
trump_state_polarity = trump_df.groupby("State", as_index=False)["Polarity"].mean()
fig = px.bar(trump_state_polarity, x="State", y="Polarity",
             title="<b>Average Polarity of Trump-Related Tweets by State</b>")
fig.update_traces(marker=dict(color="red"),selector=dict(type="bar"),
                  marker_line_color='black', marker_line_width=0.8, opacity=0.6)
fig.show()
# Average polarity by state (biden), computed the same way
biden_state_polarity = biden_df.groupby("State", as_index=False)["Polarity"].mean()
fig = px.bar(biden_state_polarity, x="State", y="Polarity",
             title="<b>Average Polarity of Biden-Related Tweets by State</b>")
fig.update_traces(marker=dict(color="blue"),selector=dict(type="bar"),
                  marker_line_color='black', marker_line_width=0.8, opacity=0.6)
fig.show()